From f7486d86420c3b7b189754ce220bb126f01f20a8 Mon Sep 17 00:00:00 2001
From: Aryeh Gregor <simetrical@users.mediawiki.org>
Date: Thu, 8 Jan 2009 23:59:25 +0000
Subject: [PATCH] Reduce code duplication correctly this time, again

The test cases I thought up are at:

http://www.mediawiki.org/wiki/User:Simetrical/Id_tests

All of them pass with the patch, except for some that fail on current
code as well: the ones involving templates, multiply-occurring section
headers, or numeric ids (there seems to be a weird bug with those that
probably involves string and numeric ids being used in the same array).
This is true whether $wgEnforceHtmlIds is on or off.  (Actually, the
problem with numeric keys doesn't happen with $wgEnforceHtmlIds off,
because of course numeric ids aren't allowed then.)
---
 includes/parser/Parser.php | 150 ++++++++++++++++++-------------------
 1 file changed, 73 insertions(+), 77 deletions(-)
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 7fcfb90a35..1a4901c392 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -3448,7 +3448,7 @@ class Parser
 	 * @private
 	 */
 	function formatHeadings( $text, $isMain=true ) {
-		global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
+		global $wgMaxTocLevel, $wgContLang;
 
 		$doNumberHeadings = $this->mOptions->getNumberHeadings();
 		$showEditLink = $this->mOptions->getEditSection();
@@ -3593,71 +3593,17 @@ class Parser
 				}
 			}
 
-			# The safe header is a version of the header text safe to use for links
-			# Avoid insertion of weird stuff like <math> by expanding the relevant sections
-			$safeHeadline = $this->mStripState->unstripBoth( $headline );
-
-			# Remove link placeholders by the link text.
-			#     <!--LINK number-->
-			# turns into
-			#     link text with suffix
-			$safeHeadline = $this->replaceLinkHoldersText( $safeHeadline );
-
-			# Strip out HTML (other than plain <sup> and <sub>: bug 8393)
-			$tocline = preg_replace(
-				array( '#<(?!/?(sup|sub)).*?'.'>#', '#<(/?(sup|sub)).*?'.'>#' ),
-				array( '',                          '<$1>'),
-				$safeHeadline
-			);
-			$tocline = trim( $tocline );
-
-			# For the anchor, strip out HTML-y stuff period
-			$safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline );
-			$safeHeadline = trim( $safeHeadline );
-
-			# Save headline for section edit hint before it's escaped
-			$headlineHint = $safeHeadline;
-
-			if ( $wgEnforceHtmlIds ) {
-				$legacyHeadline = false;
-				$safeHeadline = Sanitizer::escapeId( $safeHeadline,
-					'noninitial' );
-			} else {
-				# For reverse compatibility, provide an id that's
-				# HTML4-compatible, like we used to.
-				#
-				# It may be worth noting, academically, that it's possible for
-				# the legacy anchor to conflict with a non-legacy headline
-				# anchor on the page.  In this case likely the "correct" thing
-				# would be to either drop the legacy anchors or make sure
-				# they're numbered first.  However, this would require people
-				# to type in section names like "abc_.D7.93.D7.90.D7.A4"
-				# manually, so let's not bother worrying about it.
-				$legacyHeadline = Sanitizer::escapeId( $safeHeadline,
-					'noninitial' );
-				$safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
-
-				if ( $legacyHeadline == $safeHeadline ) {
-					# No reason to have both (in fact, we can't)
-					$legacyHeadline = false;
-				} elseif ( $legacyHeadline != Sanitizer::escapeId(
-				$legacyHeadline, 'xml' ) ) {
-					# The legacy id is invalid XML.  We used to allow this, but
-					# there's no reason to do so anymore.  Backward
-					# compatibility will fail slightly in this case, but it's
-					# no big deal.
-					$legacyHeadline = false;
-				}
-			}
+			list( $anchor, $legacyAnchor, $tocline, $headlineHint ) =
+				$this->processHeadingText( $headline );
 
 			# HTML names must be case-insensitively unique (bug 10721).  FIXME:
 			# Does this apply to Unicode characters?  Because we aren't
 			# handling those here.
-			$arrayKey = strtolower( $safeHeadline );
-			if ( $legacyHeadline === false ) {
+			$arrayKey = strtolower( $anchor );
+			if ( $legacyAnchor === false ) {
 				$legacyArrayKey = false;
 			} else {
-				$legacyArrayKey = strtolower( $legacyHeadline );
+				$legacyArrayKey = strtolower( $legacyAnchor );
 			}
 
 			# count how many in assoc. array so we can track dupes in anchors
@@ -3679,12 +3625,10 @@ class Parser
 			}
 
 			# Create the anchor for linking from the TOC to the section
-			$anchor = $safeHeadline;
-			$legacyAnchor = $legacyHeadline;
 			if ( $refers[$arrayKey] > 1 ) {
 				$anchor .= '_' . $refers[$arrayKey];
 			}
-			if ( $legacyHeadline !== false && $refers[$legacyArrayKey] > 1 ) {
+			if ( $legacyAnchor !== false && $refers[$legacyArrayKey] > 1 ) {
 				$legacyAnchor .= '_' . $refers[$legacyArrayKey];
 			}
 			if( $enoughToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) {
@@ -3756,6 +3700,70 @@ class Parser
 		}
 	}
 
+	private function processHeadingText( $headline ) {
+		global $wgEnforceHtmlIds;
+
+		# The safe headline is a version of the heading text that is safe to use for links.
+		# Expand the stripped sections (e.g. <math>) first so nothing weird ends up in it.
+		$safeHeadline = $this->mStripState->unstripBoth( $headline );
+
+		# Replace link placeholders with the link text.
+		#     <!--LINK number-->
+		# turns into
+		#     link text with suffix
+		$safeHeadline = $this->replaceLinkHoldersText( $safeHeadline );
+
+		# TOC line: strip out HTML except <sup>/<sub>, which are reduced to bare tags (bug 8393)
+		$tocline = preg_replace(
+			array( '#<(?!/?(sup|sub)).*?'.'>#', '#<(/?(sup|sub)).*?'.'>#' ),
+			array( '',                          '<$1>'),
+			$safeHeadline
+		);
+		$tocline = trim( $tocline );
+
+		# For the anchor, strip out every HTML-like tag, with no exceptions
+		$safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline );
+		$safeHeadline = trim( $safeHeadline );
+
+		# Keep the tag-stripped but not yet escaped headline for the section edit hint
+		$headlineHint = $safeHeadline;
+
+		if ( $wgEnforceHtmlIds ) {
+			$legacyHeadline = false;
+			$safeHeadline = Sanitizer::escapeId( $safeHeadline,
+				'noninitial' );
+		} else {
+			# For backward compatibility, also provide an id that's
+			# HTML4-compatible, like we used to.
+			#
+			# It may be worth noting, academically, that it's possible for
+			# the legacy anchor to conflict with a non-legacy headline
+			# anchor on the page.  In this case likely the "correct" thing
+			# would be to either drop the legacy anchors or make sure
+			# they're numbered first.  However, this would require people
+			# to type in section names like "abc_.D7.93.D7.90.D7.A4"
+			# manually, so let's not bother worrying about it.
+			$legacyHeadline = Sanitizer::escapeId( $safeHeadline,
+				'noninitial' );
+			$safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
+
+			if ( $legacyHeadline == $safeHeadline ) {
+				# Both ids are identical, so there is no reason to have both (in fact, we can't)
+				$legacyHeadline = false;
+			} elseif ( $legacyHeadline != Sanitizer::escapeId(
+			$legacyHeadline, 'xml' ) ) {
+				# The legacy id is invalid XML.  We used to allow this, but
+				# there's no reason to do so anymore.  Backward
+				# compatibility will fail slightly in this case, but it's
+				# no big deal.
+				$legacyHeadline = false;
+			}
+		}
+
+		return array( $safeHeadline, $legacyHeadline, $tocline,
+			$headlineHint );
+	}
+
 	/**
 	 * Transform wiki markup when saving a page by doing \r\n -> \n
 	 * conversion, substitting signatures, {{subst:}} templates, etc.
@@ -4736,21 +4744,9 @@ class Parser
 	 * "== Header ==".
 	 */
 	public function guessSectionNameFromWikiText( $text ) {
-		# Strip out wikitext links(they break the anchor)
 		$text = $this->stripSectionName( $text );
-		$headline = Sanitizer::decodeCharReferences( $text );
-		# strip out HTML
-		$headline = StringUtils::delimiterReplace( '<', '>', '', $headline );
-		$headline = trim( $headline );
-		$sectionanchor = '#' . urlencode( str_replace( ' ', '_', $headline ) );
-		$replacearray = array(
-			'%3A' => ':',
-			'%' => '.'
-		);
-		return str_replace(
-			array_keys( $replacearray ),
-			array_values( $replacearray ),
-			$sectionanchor );
+		list( $text, /* only the anchor (first element) is needed here */ ) = $this->processHeadingText( $text );
+		return "#$text";
 	}
 
 	/**
-- 
2.20.1